8 Differential Expression

We then compare the protein expression between strains for each pairwise comparison (28 unique pairwise combinations)

To detect differentially expressed genes, we use the limma analysis on normalized protein expression.

8.1 Volcano plots

First, we can look at the volcano plots (log-foldchange vs qvalue) for each unique pairwise comparison.

#ind_na_rows  = find_na_rows(int_norm,as.indices = T)
df_imputed = tibble( uniprot = int_norm$uniprot,
                     is_imputed = (rowSums( is.na(int_norm))>0)* 1, 
                     imputed = factor(is_imputed, levels = c(0,1), labels = c("not", "is_imputed")))
#int_norm_bpca$imputed = rowSums( is.na(int_norm))>0
volcPlot(INPUT=int_norm_bpca, IMPUTED=df_imputed,MIN_LFC=2, MIN_PVAL=0.01, WHICH='both', TOPN = 20)
#> Joining, by = c("ID", "pValue", "qValue", "EffectSize",
#> "comparison", "sig", "log10_qvalue", "SGD", "ORF",
#> "UNIPROT", "GENENAME", "is_imputed", "imputed")

8.2 Differentially expressed genes

# Without NA
volcano_data =  get_volcano_data(input_data=int_norm_nona, which='both',
                            min_lfc=2, min_pval=0.01, topn = 20)
df_limma = bind_rows(volcano_data) %>% as_tibble()
dfe_nona = subset(df_limma,sig!='Non significant')
N_DFE_NONA = nrow(dfe_nona)
NPROT_DFE_NONA =  n_distinct(dfe_nona$ID)

# With imputed expression to replace NA
dfe =  get_volcano_data(input_data=int_norm_bpca, which='both',
                            min_lfc=2, min_pval=0.01, topn = 20) %>% 
                bind_rows %>% as_tibble() %>% filter(sig!='Non significant')

N_DFE_IMPUTE = nrow(dfe)
NPROT_DFE_IMPUTE =  n_distinct(dfe$ID)

down = dfe %>% group_by(ID) %>% dplyr::filter(sig=='Downregulated') %>%
       summarize( strains_down = paste0(comparison,collapse=' '),
                  down_AMH = str_count(strains_down,'AMH-'),
                  down_BAN = str_count(strains_down,'BAN-'),
                  down_BED = str_count(strains_down,'BED-'),
                  down_BPL = str_count(strains_down,'BPL-'),
                  down_BTT = str_count(strains_down,'BTT-'),
                  down_CMP = str_count(strains_down,'CMP-'),
                  down_CPI = str_count(strains_down,'CPI-'),
                  down_CQC = str_count(strains_down,'CQC-'))
up = dfe %>% group_by(ID) %>% dplyr::filter(sig=='Upregulated') %>%  
      summarize( strains_up = paste0(comparison,collapse=' '),
                  up_AMH = str_count(strains_up,'AMH-'),
                  up_BAN = str_count(strains_up,'BAN-'),
                  up_BED = str_count(strains_up,'BED-'),
                  up_BPL = str_count(strains_up,'BPL-'),
                  up_BTT = str_count(strains_up,'BTT-'),
                  up_CMP = str_count(strains_up,'CMP-'),
                  up_CPI = str_count(strains_up,'CPI-'),
                  up_CQC = str_count(strains_up,'CQC-'))
# get_dfe(int_norm, MIN_LFC=2, MIN_PVAL=0.01,  WHICH='both', TOPN = 20) %>% remove_rownames() %>% 
#   dplyr::left_join(sc_identifiers, by=c('ID'='UNIPROT'))

# Number of times a hit is differentially expressed
df_dfe = dfe %>% left_join(janitor::tabyl(dfe,ID,sig)) %>%
         left_join(down) %>% left_join(up) %>%
         rename(uniprot=ID) %>% 
         group_by(uniprot,comparison) %>% 
        mutate( n_strains_up = sum(c_across(starts_with('up_')) !=0 ),
                n_strains_down = sum(c_across(starts_with('down_'))!=0)) %>%
        replace_na(list(n_strains_up=0,n_strains_down=0)) %>%
        left_join(evo_yeast, by=c('uniprot'='UNIPROT')) 
#> Joining, by = "ID"
#> Joining, by = "ID"
#> Joining, by = "ID"
  

df_dfe_annot = df_dfe %>%
          left_join(sc_annotation_orf,by=c('uniprot'='UNIPROT')) %>%
  mutate(uniprot_link = paste0("<a href='https://www.uniprot.org/uniprot/",uniprot,"'>",uniprot,"</a>"),
         sgd_link = paste0("<a href='https://www.yeastgenome.org/locus/",SGD,"'>",SGD,"</a>"),
         regulated = Downregulated+Upregulated) %>% 
  dplyr::relocate(uniprot,uniprot_link,sgd_link,regulated,Downregulated,Upregulated, 
                  GENENAME,ORF,PNAME,'FUNCTION','BIOPROCESS_all','ORTHO','OTHER')

We get 225 genes differentially expressed when excluding genes with missing expression in any samples.

On average, each gene is detected in 3.4888889 unique pairwise strain comparison.

After imputation of missing expression with bpca (Bayesian missing value imputation), we get 230 more genes differentially expressed (n=455).

On average, each gene is detected in 3.6351648 unique pairwise strain comparison.

The following table shows the list of differentially expressed genes across all unique pairwise comparison, with annotations data and conservation/snp information.

library(kableExtra)
library(formattable)
library(DT)

ft_dt = df_dfe_annot %>% 
  formattable(
    list(
      `Downregulated` = color_tile("white", "red"),
      `Upregulated` = color_tile("white", "blue"),
      `regulated` = color_tile("white", "gray")
    )
) %>% as.datatable(
        options = list(
            fixedHeader=T,
            paging = TRUE, pageLength = 20,  ## paginate the output and #rows for each page
            scrollY = TRUE,   ## enable scrolling on X/Y axis
            #autoWidth = TRUE, ## use smart column width handling
            server = FALSE,   ## use client-side processing
            dom = 'Bfrtip', buttons = list('csv', 'excel', list(extend = 'colvis')),
            fixedColumns = list(leftColumns = 1),
            columnDefs = list(list(width = '50px', visible=TRUE, targets = "_all"))
          ),
  extensions = c('FixedHeader','FixedColumns','Buttons'),
  selection = 'single',           ## enable selection of a single row
  filter = 'bottom',              ## include column filters at the bottom
  rownames = FALSE,               ## don't show row numbers/names
  width = NULL, 
  height = NULL,
  caption = NULL
) %>% 
   formatStyle(columns = 1:30, target= 'row',lineHeight='100%', `font-size` = '12px')

ft_dt

8.3 Heatmap of expression for differentially expressed genes


dat_scaled = int_scaled_strains %>% as.data.frame() %>% rownames_to_column('uniprot') 

dfe_exp = dat_scaled %>% dplyr::filter( uniprot %in% dfe$ID) %>% 
          left_join(sc_identifiers,by=c('uniprot'='UNIPROT'))  %>% 
          dplyr::filter(!duplicated(GENENAME)) %>%
          mutate(GENENAME=if_na(GENENAME,uniprot))%>%
          column_to_rownames(var = 'GENENAME') %>%
          dplyr::select(-ORF,-uniprot,-SGD)

p_dfe_exp=pheatmap::pheatmap(dfe_exp,
                             fontsize = 5,cellwidth = 5,cellheight =5,border_color = NA,treeheight_col = 10,
                             filename = here('plot','heatmap-exp.pdf'))
knitr::include_graphics(here('plot','heatmap-exp.pdf'))

8.4 Heatmap of expression differences



dfe_lfc = get_volcano_data(input_data=int_norm_bpca, which='both',min_lfc=2, min_pval=0.01, topn = 20) %>% 
          bind_rows %>% as_tibble() %>%
          pivot_wider(id_cols=ID, names_from = 'comparison', values_from = 'EffectSize') %>% 
          dplyr::filter(ID %in% dfe$ID) %>% 
          left_join(sc_identifiers,by=c('ID'='UNIPROT')) %>%
          dplyr::filter(!duplicated(GENENAME)) %>%
          mutate(GENENAME=if_na(GENENAME,ID))%>%
          column_to_rownames('GENENAME')

dfe_lfc_mat = dfe_lfc %>% dplyr::select(-ORF,-ID,-SGD)
# Heatmap of differentially expressed genes
pheatmap::pheatmap(dfe_lfc_mat,fontsize = 5,cutree_rows = 10,cellwidth = 5,cellheight =5,border_color = NA,
                          treeheight_col = 10, filename = here('plot','heatmap-lfc.pdf')  )
knitr::include_graphics(here('plot','heatmap-lfc.pdf'))

8.5 Cluster genes based on their profile of differential expression strain-pairwise

# Transpose the matrix to calculate distance between experiments row-wise
d_strain <- dfe_lfc_mat %>% t() %>% dist(.,method = "euclidean", diag = FALSE, upper = FALSE)
# Calculate the distance between proteins row-wise 
d_prot <- dfe_lfc_mat %>% dist(.,method = "euclidean", diag = FALSE, upper= FALSE)

hc_prot = hclust(d_prot,method='ward.D2')
cl_prot = hc_prot %>% cutree(k=10)
library(dendextend)
#> 
#> ---------------------
#> Welcome to dendextend version 1.15.2
#> Type citation('dendextend') for how to cite the package.
#> 
#> Type browseVignettes(package = 'dendextend') for the package vignette.
#> The github page is: https://github.com/talgalili/dendextend/
#> 
#> Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
#> You may ask questions at stackoverflow, use the r and dendextend tags: 
#>   https://stackoverflow.com/questions/tagged/dendextend
#> 
#>  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
#> ---------------------
#> 
#> Attaching package: 'dendextend'
#> The following objects are masked from 'package:ape':
#> 
#>     ladderize, rotate
#> The following object is masked from 'package:data.table':
#> 
#>     set
#> The following object is masked from 'package:ggtree':
#> 
#>     rotate
#> The following object is masked from 'package:ggpubr':
#> 
#>     rotate
#> The following object is masked from 'package:Biostrings':
#> 
#>     nnodes
#> The following object is masked from 'package:stats':
#> 
#>     cutree
dend_prot = as.dendrogram(hc_prot)
dend_prot = rotate(dend_prot,seq_along(cl_prot))
dend_prot <- color_branches(dend_prot, k=10)
dend_prot <- color_labels(dend_prot, k=10)

#labels_colors(dend) <-rainbow_hcl(3)[sort_levels_values( as.numeric(dend_prot[,5])[order.dendrogram(dend)] )]
dend_prot <- hang.dendrogram(dend_prot,hang_height=0.1)
dend_prot <- set(dend_prot, "labels_cex", 0.5)
# And plot:
par(mar = c(2,3,0,3))
plot(dend_prot,horiz =  TRUE,  nodePar = list(cex = .007))

#legend("topleft", legend = iris_species, fill = rainbow_hcl(10))
par(mar = rep(0,4))
circlize_dendrogram(dend_prot)
#> Loading required namespace: circlize

8.6 Functional map for differentially expressed genes

library(treemap)
library(d3Tree)
treemap(df_dfe_annot, index=c("BIOPROCESS_all", "comparison"), vSize='regulated', vColor="EffectSize", type="value") 
treemap(df_dfe_annot, index=c("BIOPROCESS_all", "GENENAME"), vSize="regulated", vColor="EffectSize", type="value",)

8.7 Functional enrichment for differentially expressed genes

library(enrichplot)
#> 
#> 
#> Attaching package: 'enrichplot'
#> The following object is masked from 'package:GGally':
#> 
#>     ggtable
#> The following object is masked from 'package:ggpubr':
#> 
#>     color_palette
library(clusterProfiler)
#> clusterProfiler v4.2.2  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
#> 
#> If you use clusterProfiler in published research, please cite:
#> T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
#> 
#> Attaching package: 'clusterProfiler'
#> The following object is masked from 'package:XVector':
#> 
#>     slice
#> The following object is masked from 'package:IRanges':
#> 
#>     slice
#> The following object is masked from 'package:S4Vectors':
#> 
#>     rename
#> The following object is masked from 'package:purrr':
#> 
#>     simplify
#> The following object is masked from 'package:stats':
#> 
#>     filter

all_genes = int_scaled_strains %>%
            as.data.frame() %>%
            rownames_to_column('ID') %>% 
            left_join(sc_identifiers,by=c('ID'='UNIPROT')) %>%
            dplyr::filter(!duplicated(GENENAME)) %>%
            mutate(GENENAME=if_na(GENENAME,ID)) %>% 
            as_tibble() %>% 
            dplyr::select(where(is.character))


genes_proteomics = all_genes$ORF
genes_dfe = dfe_lfc$ORF
genes_dfe_uni = dfe_lfc$ID

genes_dfe_list = df_dfe_annot %>% group_by(ORF) %>% summarize( mean_lfc = mean(EffectSize)) %>% arrange(desc(mean_lfc)) %>% pull(mean_lfc,ORF) 
genes_dfe_list_uni = df_dfe_annot %>% group_by(uniprot) %>% summarize( mean_lfc = mean(EffectSize)) %>% arrange(desc(mean_lfc)) %>% pull(mean_lfc,uniprot)


# ggo <- groupGO(gene     = gene,
#                OrgDb    = org.Sc.sgd.db,
#                ont      = "CC",
#                level    = 3,
#                readable = TRUE)
library(org.Sc.sgd.db)
#> Loading required package: AnnotationDbi
#> Loading required package: Biobase
#> Welcome to Bioconductor
#> 
#>     Vignettes contain introductory material; view
#>     with 'browseVignettes()'. To cite Bioconductor,
#>     see 'citation("Biobase")', and for packages
#>     'citation("pkgname")'.
#> 
#> Attaching package: 'AnnotationDbi'
#> The following object is masked from 'package:clusterProfiler':
#> 
#>     select
#> The following object is masked from 'package:plotly':
#> 
#>     select
#> The following object is masked from 'package:dplyr':
#> 
#>     select
#> 
# ego <- enrichGO(gene          = genes_dfe,
#                 keyType       = "ORF",
#                 universe      = genes_proteomics,
#                 OrgDb         = org.Sc.sgd.db,
#                 ont           = "ALL",
#                 pAdjustMethod = "BH",
#                 pvalueCutoff  = 0.01,
#                 qvalueCutoff  = 0.05,
#                 readable      = F,
#                 pool=T)
# dotplot(ego)

ego3 <- gseGO(geneList     = genes_dfe_list,
              OrgDb        = org.Sc.sgd.db,
              keyType       = "ORF",
              ont          = "ALL",
              minGSSize    = 20,
              maxGSSize    = 500,
              pvalueCutoff = 0.05,
              verbose      = FALSE)
dotplot(ego3)
heatplot(ego3,foldChange = genes_dfe_list)

#kk <- enrichKEGG(gene         = gene,organism     = 'sce', pvalueCutoff = 0.05)
kk2 <- gseKEGG(geneList     = genes_dfe_list_uni,organism     = 'sce', minGSSize    = 20,keyType = 'uniprot',
               pvalueCutoff = 0.05,verbose      = FALSE)
#> Reading KEGG annotation online:
#> Reading KEGG annotation online:
#> 
#> Reading KEGG annotation online:
heatplot(kk2, foldChange=genes_dfe_list_uni)

mkk <- enrichMKEGG(gene = genes_dfe_uni,
                    organism = 'sce',
                    pvalueCutoff = 1,
                    qvalueCutoff = 1,
                    keyType = 'uniprot')
#> Reading KEGG annotation online:
#> 
#> Reading KEGG annotation online:
#> 
#> Reading KEGG annotation online:
dotplot(mkk)

8.8 Comparison of functional enrichment for clusters of differentially expressed genes

genes_dfe_cluster = split(dfe_lfc$ORF,cl_prot)

ck <- compareCluster(geneCluster = genes_dfe_cluster, fun = enrichGO,
                     keyType= "ORF",
                     ont='ALL',
                     universe=genes_proteomics,
                     OrgDb= org.Sc.sgd.db,
                     pAdjustMethod = "BH",
                     pvalueCutoff  = 0.01,
                     qvalueCutoff  = 0.05,pool=F)
dotplot(ck)